Here we load all packages we will be needing, create the folders we will be working on, define functions and read input files (annotations, counts and sample annotation)
library(ggplot2)
library(RColorBrewer)
library(data.table)
library(writexl)
library(ggpubr)
library(DESeq2)
library(pheatmap)
library(stringr)
library(rjson)
library(Hmisc)
path <- "/Volumes/My Passport/hd_in/02.08.20_hd/"
dir.create(paste(path, "tables", sep=""))
## Warning in dir.create(paste(path, "tables", sep = "")): '/Volumes/My Passport/
## hd_in/02.08.20_hd/tables' already exists
dir.create(paste(path, "plots", sep=""))
## Warning in dir.create(paste(path, "plots", sep = "")): '/Volumes/My Passport/
## hd_in/02.08.20_hd/plots' already exists
dir.create(paste(path, "GO_analysis", sep=""))
## Warning in dir.create(paste(path, "GO_analysis", sep = "")): '/Volumes/My
## Passport/hd_in/02.08.20_hd/GO_analysis' already exists
more_10 <- function(row){
return(length(which(row > 10)))
}
t.test_row <- function(row, coldata, var, condition_base, condition_compare){
row <- as.data.frame(row)
colnames(row) <- row['gene_id',]
row <- row[coldata$Column,,drop=F]
row <- merge(row, coldata[, var, drop=F], by='row.names')
row[,2] <- as.numeric(as.character(row[,2]))
mean_base <- mean(subset(row, row[,var] == condition_base)[, 2])
mean_compare <- mean(subset(row, row[,var] == condition_compare)[, 2])
gene_id <- colnames(row)[2]
colnames(row) <- c('sample', 'expression', 'condition')
row$expression <- as.numeric(as.character(row$expression))
if(sum(row$expression) > 0){
if(sum(subset(row, row$condition == condition_base)$expression) > 0){
normality_base <- shapiro.test(subset(row, row$condition == condition_base)$expression)
}
else{
normality_base <- data.frame(p.value=NA)
}
if(sum(subset(row, row$condition == condition_compare)$expression) > 0){
normality_compare <- shapiro.test(subset(row, row$condition == condition_compare)$expression)
}
else{
normality_compare <- data.frame(p.value=NA)
}
row$condition <- factor(row$condition, levels=c(condition_compare, condition_base))
test <- t.test(expression~condition, data=row, paired=F, exact=F)
results <- c(gene_id, mean_base, mean_compare, log2((mean_compare/mean_base)), normality_base$p.value, normality_compare$p.value, test$statistic, test$p.value, test$conf.int)
}
else{
results <- c(gene_id, mean_base, mean_compare, log2((mean_compare/mean_base)), NA, NA, NA,NA, NA, NA)
}
names(results) <- c("Gene id", paste("Mean", condition_base), paste("Mean", condition_compare), "Log2FC", paste('Shapiro Pvalue -', condition_base), paste('Shapiro Pvalue -', condition_compare), 'T', 'Pvalue', 'Low conf int', 'High conf int')
return(results)
}
gene_transcript <- fread('/Volumes/My Passport/annotations/human/gencode/v30/gencode.v30.annotation.transcript.id.name.type.tab', data.table=F, header=F)
colnames(gene_transcript) <- c('transcript_id', 'gene_id', 'gene_name', 'gene_type')
coldata <- fread('/Volumes/My Passport/hd_in/09.19_hd/CategoricalAnnotation.txt', data.table = F)
rownames(coldata) <- coldata$Column
coldata$Sample <- as.character(coldata$Sample)
coldata$Column <- as.character(coldata$Column)
rna <- fread('/Volumes/My Passport/hd_in/02.08.20_hd/3_stdmapping/1_readcounts/gene_count_matrix_0_p.csv', data.table=F)
rownames(rna) <- rna$Geneid
colnames(rna)[7:ncol(rna)] <- sapply(str_split(colnames(rna)[7:ncol(rna)], "/"), `[[`, 5)
The first thing we want to check is how similar or different fibroblasts and iNs really are. Here we produce a PCA plot and perform differential expression analysis with DESeq2.
# Filter out genes with less than 10 reads in one sample
rna_expressed <- rna[which(apply(rna[,coldata$Column], 1, more_10) > 0), coldata$Column]
dds_rna <- DESeqDataSetFromMatrix(rna_expressed[,coldata$Column], coldata, design= ~Stage)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds_rna$Stage <- relevel(dds_rna$Stage, 'CTRL')
dds_rna <- DESeq(dds_rna)
rna_expressed_norm <- as.data.frame(counts(dds_rna, normalize=T))
vst_rna <- varianceStabilizingTransformation(dds_rna)
pca <- plotPCA(vst_rna, intgroup=c("CellType", "Stage"))
pca <- pca + theme_classic() + geom_hline(yintercept = 0) + geom_vline(xintercept = 0) + labs(colour="") + ggtitle("PCA FB vs iN", subtitle = "RNA-seq") + theme(plot.title = element_text(hjust = 0.5, face="bold"), plot.subtitle = element_text(hjust = 0.5))
pdf("/Volumes/My Passport/hd_in/02.08.20_hd/plots/SuppFig2a.pdf")
pca
dev.off()
## quartz_off_screen
## 2
ggsave(filename = "/Volumes/My Passport/hd_in/02.08.20_hd/plots/SuppFig2a.svg", device = "svg", plot = pca)
pca
To increase sensitivity (and aware of this approaches’ limitations), we then used the median-of-ratios normalized counts (produced with DESeq2). We will be using our function t.test_row defined in the settings section. We are going to iterate over rna_expressed_norm using apply (on rows - 1), passing the variable we are testing, setting iNs as the reference or base condition.
rna_expressed_norm$gene_id <- rownames(rna_expressed_norm)
# iNs as reference condition
rna_norm_test2 <- apply(rna_expressed_norm, 1, FUN=t.test_row, coldata, "CellType", "IN", "FB")
rna_norm_test2 <- as.data.frame(t(rna_norm_test2))
rna_norm_test2$`Gene id` <- as.character(rna_norm_test2$`Gene id`)
rna_norm_test2$`Mean IN` <- as.numeric(as.character(rna_norm_test2$`Mean IN`))
rna_norm_test2$`Mean FB` <- as.numeric(as.character(rna_norm_test2$`Mean FB`))
rna_norm_test2$`Log2FC` <- as.numeric(as.character(rna_norm_test2$`Log2FC`))
rna_norm_test2$`Shapiro Pvalue - IN` <- as.numeric(as.character(rna_norm_test2$`Shapiro Pvalue - IN`))
rna_norm_test2$`Shapiro Pvalue - FB` <- as.numeric(as.character(rna_norm_test2$`Shapiro Pvalue - FB`))
rna_norm_test2$T <- as.numeric(as.character(rna_norm_test2$T))
rna_norm_test2$Pvalue <- as.numeric(as.character(rna_norm_test2$Pvalue))
rna_norm_test2$`Low conf int` <- as.numeric(as.character(rna_norm_test2$`Low conf int`))
rna_norm_test2$`High conf int` <- as.numeric(as.character(rna_norm_test2$`High conf int`))
# Filter out genes that are not normally distributed
rna_norm_test2 <- rna_norm_test2[which(rna_norm_test2$`Shapiro Pvalue - IN` > 0.05 & rna_norm_test2$`Shapiro Pvalue - FB` > 0.05),]
# Get the counts of the ones that are normally distributed
rna_expressed_norm <- subset(rna_expressed_norm, rna_expressed_norm$gene_id %in% rna_norm_test2$`Gene id`)
# Which one of those is significantly different?
rna_signdiff <- rna_norm_test2[which(rna_norm_test2$Pvalue < 0.05),]
# Tag them in one
rna_norm_test2$type <- ifelse(rna_norm_test2$Pvalue < 0.05 & rna_norm_test2$Log2FC < 0, 'Downregulated',
ifelse(rna_norm_test2$Pvalue < 0.05 & rna_norm_test2$Log2FC > 0, 'Upregulated', 'Not significant'))
rna_norm_test2$colours <- ifelse(rna_norm_test2$type == 'Downregulated', 'steelblue',
ifelse(rna_norm_test2$type == 'Upregulated', 'tomato3', 'black'))
title <- 'RNA expression FB vs iN'
subtitle <- 'p-value < 0.05; Log2FC > 0'
pdf("/Volumes/My Passport/hd_in/02.08.20_hd/plots/Fig1d.pdf")
plot(log2(rna_norm_test2$`Mean IN`+0.5),
log2(rna_norm_test2$`Mean FB`+0.5),
col=rna_norm_test2$colours,
cex=0.5,
pch=16,
xlab='log2(mean iN)',
ylab='log2(mean FB)')
mtext(line=1, adj=0.5, cex=1, font=2, title)
mtext(line=0, adj=0.5, cex=0.8, subtitle)
legend("bottomright", legend = c(paste("Up (",as.numeric(table(rna_norm_test2$type)["Upregulated"]),")",sep=""),
paste("Down (",as.numeric(table(rna_norm_test2$type)["Downregulated"]),")",sep = ""),
paste("Not significant (",as.numeric(table(rna_norm_test2$type)["Not significant"]),")",sep = "")),
pch=16,col=c("firebrick3","steelblue4","black"),cex=1)
dev.off()
## quartz_off_screen
## 2
svg("/Volumes/My Passport/hd_in/02.08.20_hd/plots/Fig1d.svg")
plot(log2(rna_norm_test2$`Mean IN`+0.5),
log2(rna_norm_test2$`Mean FB`+0.5),
col=rna_norm_test2$colours,
cex=0.5,
pch=16,
xlab='log2(mean iN)',
ylab='log2(mean FB)')
mtext(line=1, adj=0.5, cex=1, font=2, title)
mtext(line=0, adj=0.5, cex=0.8, subtitle)
legend("bottomright", legend = c(paste("Up (",as.numeric(table(rna_norm_test2$type)["Upregulated"]),")",sep=""),
paste("Down (",as.numeric(table(rna_norm_test2$type)["Downregulated"]),")",sep = ""),
paste("Not significant (",as.numeric(table(rna_norm_test2$type)["Not significant"]),")",sep = "")),
pch=16,col=c("firebrick3","steelblue4","black"),cex=1)
dev.off()
## quartz_off_screen
## 2
plot(log2(rna_norm_test2$`Mean IN`+0.5),
log2(rna_norm_test2$`Mean FB`+0.5),
col=rna_norm_test2$colours,
cex=0.5,
pch=16,
xlab='log2(mean iN)',
ylab='log2(mean FB)')
mtext(line=1, adj=0.5, cex=1, font=2, title)
mtext(line=0, adj=0.5, cex=0.8, subtitle)
legend("bottomright", legend = c(paste("Up (",as.numeric(table(rna_norm_test2$type)["Upregulated"]),")",sep=""),
paste("Down (",as.numeric(table(rna_norm_test2$type)["Downregulated"]),")",sep = ""),
paste("Not significant (",as.numeric(table(rna_norm_test2$type)["Not significant"]),")",sep = "")),
pch=16,col=c("firebrick3","steelblue4","black"),cex=1)
We then write these results to excels (up/downregulated genes).
# Significantly different
rna_norm_test_sign <- subset(rna_norm_test2, rna_norm_test2$type != 'Not significant')
# Up and down regulated for PANTHER
rna_norm_test_upreg <- merge(rna_norm_test2[which(rna_norm_test2$type == 'Upregulated'),], unique(gene_transcript[,c("gene_id", "gene_name")]), by.x='row.names', by.y='gene_id')
# write.table(rna_norm_test_upreg$gene_name, quote=F, col.names = F, row.names = F,
# file=paste(path, "GO_analysis/rna_upreg_fb.in.tab", sep=''))
rna_norm_test_dwnreg <- merge(rna_norm_test2[which(rna_norm_test2$type == 'Downregulated'),], unique(gene_transcript[,c("gene_id", "gene_name")]), by.x='row.names', by.y='gene_id')
# write.table(rna_norm_test_dwnreg$gene_name, quote=F, col.names = F, row.names = F,
# file=paste(path, 'GO_analysis/rna_dwnreg_fb.in.tab', sep=''))
rna_norm_test2 <- merge(rna_norm_test2, unique(gene_transcript[,c('gene_id', 'gene_name')]), by.x='Gene id', by.y='gene_id')
# write.table(rna_norm_test2$gene_name, quote=F, col.names = F, row.names = F,
# file=paste(path, 'GO_analysis/rna_expressed.tab', sep=''))
to_file <- c("Gene", "Mean IN", "Mean FB", "Log2FC", "Shapiro Pvalue - IN",
"Shapiro Pvalue - FB", "T", "Pvalue", "Low conf int", "High conf int", "type")
rna_norm_test_upreg_toxlsx <- rna_norm_test_upreg
colnames(rna_norm_test_upreg_toxlsx) <- c("gene_id","Gene id","Mean IN","Mean FB",
"Log2FC","Shapiro Pvalue - IN",
"Shapiro Pvalue - FB","T","Pvalue","Low conf int","High conf int","type",
"colours","Gene")
rna_norm_in_test_dwnreg_toxlsx <- rna_norm_test_dwnreg
colnames(rna_norm_in_test_dwnreg_toxlsx) <- c("gene_id","Gene id","Mean IN","Mean FB",
"Log2FC","Shapiro Pvalue - IN",
"Shapiro Pvalue - FB","T","Pvalue","Low conf int","High conf int","type",
"colours","Gene")
writexl::write_xlsx(x = rna_norm_in_test_dwnreg_toxlsx[,to_file],
path = paste(path, "tables/STable1.xlsx", sep=''))
writexl::write_xlsx(x = rna_norm_test_upreg_toxlsx[,to_file],
path = paste(path, "tables/STable2.xlsx", sep=''))
Here is the head of the results of the upregulated genes in FB (RNAseq).
head(rna_norm_test_upreg_toxlsx)[,to_file]
## Gene Mean IN Mean FB Log2FC Shapiro Pvalue - IN Shapiro Pvalue - FB
## 1 NIPAL3 1605.74700 5610.0359 1.8047654 0.30646921 0.33864191
## 2 LAS1L 787.53561 1646.0172 1.0635623 0.86055185 0.86053087
## 3 ANKIB1 218.37629 391.5448 0.8423610 0.83233391 0.99852408
## 4 LASP1 909.36091 2085.0847 1.1971811 0.08945971 0.33838099
## 5 CASP10 89.61925 170.5543 0.9283503 0.05511672 0.14089474
## 6 CFLAR 624.92363 838.2318 0.4236694 0.77909481 0.07746327
## T Pvalue Low conf int High conf int type
## 1 9.014113 3.188930e-07 3051.995276 4956.5824 Upregulated
## 2 6.730765 6.924344e-06 586.500913 1130.4622 Upregulated
## 3 4.507151 2.455276e-04 92.709934 253.6270 Upregulated
## 4 7.222674 6.298002e-07 835.626812 1515.8208 Upregulated
## 5 2.285472 3.473975e-02 6.490651 155.3794 Upregulated
## 6 2.605002 1.569544e-02 44.092545 382.5238 Upregulated
Here is the head of the results of the upregulated genes in iNs (RNAseq).
head(rna_norm_in_test_dwnreg_toxlsx)[,to_file]
## Gene Mean IN Mean FB Log2FC Shapiro Pvalue - IN
## 1 NFYA 939.30041 460.89145 -1.0271596 0.86861377
## 2 RAD52 89.76432 56.79356 -0.6604146 0.44278479
## 3 CYP26B1 269.22279 10.24049 -4.7164438 0.44357831
## 4 SARM1 42.58264 11.65266 -1.8696065 0.47532142
## 5 RBM6 953.45573 787.59412 -0.2757136 0.05053354
## 6 SLC25A13 437.87373 181.81972 -1.2680062 0.13649640
## Shapiro Pvalue - FB T Pvalue Low conf int High conf int
## 1 0.4873177 -7.567975 7.294872e-07 -611.69332 -345.124605
## 2 0.3372384 -2.792654 1.127035e-02 -57.60544 -8.336064
## 3 0.6275317 -9.726111 2.257097e-07 -316.44268 -201.521931
## 4 0.4632906 -9.276284 5.703009e-09 -37.85400 -24.005971
## 5 0.2291887 -2.162984 4.034898e-02 -323.82595 -7.897263
## 6 0.9789090 -7.991571 2.351713e-07 -323.32999 -188.778031
## type
## 1 Downregulated
## 2 Downregulated
## 3 Downregulated
## 4 Downregulated
## 5 Downregulated
## 6 Downregulated
Now that we know how different iNs and fibroblasts are, we went ahead and tested for differences between conditions (HD vs control). Here we perform DEA with DESeq2 and produce a PCA plot.
coldata_fb <- subset(coldata, coldata$CellType == 'FB')
# Filter out genes with less than 10 reads in one sample
rna_expressed_fb <- rna[which(apply(rna[,coldata_fb$Column], 1, more_10) > 0), coldata_fb$Column]
# DESeq DEA and normalizing with median of ratios
dds_fb <- DESeqDataSetFromMatrix(rna_expressed_fb[,coldata_fb$Column], coldata_fb, design= ~Stage)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds_fb$Stage <- relevel(dds_fb$Stage, 'CTRL')
dds_fb <- DESeq(dds_fb)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 108 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
res_fb <- results(dds_fb)
vst_fb <- varianceStabilizingTransformation(dds_fb)
plotPCA(vst_fb, intgroup="Stage")
Same as in the first section, we now test the difference of expression of each gene (row) in the normalized counts data frame between HD and control using unpaired t.tests.
rna_norm_fb <- as.data.frame(counts(dds_fb, normalize=T))
rna_norm_fb$gene_id <- rownames(rna_norm_fb)
rna_norm_fb_test2 <- apply(rna_norm_fb, 1, FUN=t.test_row, coldata, "Stage", "CTRL", "HD")
rna_norm_fb_test2 <- as.data.frame(t(rna_norm_fb_test2))
rna_norm_fb_test2$`Gene id` <- as.character(rna_norm_fb_test2$`Gene id`)
rna_norm_fb_test2$`Mean CTRL` <- as.numeric(as.character(rna_norm_fb_test2$`Mean CTRL`))
rna_norm_fb_test2$`Mean HD` <- as.numeric(as.character(rna_norm_fb_test2$`Mean HD`))
rna_norm_fb_test2$`Log2FC` <- as.numeric(as.character(rna_norm_fb_test2$`Log2FC`))
rna_norm_fb_test2$`Shapiro Pvalue - HD` <- as.numeric(as.character(rna_norm_fb_test2$`Shapiro Pvalue - HD`))
rna_norm_fb_test2$`Shapiro Pvalue - CTRL` <- as.numeric(as.character(rna_norm_fb_test2$`Shapiro Pvalue - CTRL`))
rna_norm_fb_test2$T <- as.numeric(as.character(rna_norm_fb_test2$T))
rna_norm_fb_test2$Pvalue <- as.numeric(as.character(rna_norm_fb_test2$Pvalue))
rna_norm_fb_test2$`Low conf int` <- as.numeric(as.character(rna_norm_fb_test2$`Low conf int`))
rna_norm_fb_test2$`High conf int` <- as.numeric(as.character(rna_norm_fb_test2$`High conf int`))
# Filter out genes that are not normally distributed
# rna_norm_fb_test2 <- rna_norm_fb_test2[which(rna_norm_fb_test2$`Shapiro Pvalue - HD` > 0.05 & rna_norm_fb_test2$`Shapiro Pvalue - Control` > 0.05),]
rna_norm_fb_test2 <- rna_norm_fb_test2[which(rna_norm_fb_test2$`Shapiro Pvalue - HD` > 0.05 & rna_norm_fb_test2$`Shapiro Pvalue - CTRL` > 0.05),]
# Get the counts of the ones that are normally distributed
rna_norm_fb <- subset(rna_norm_fb, rna_norm_fb$gene_id %in% rna_norm_fb_test2$`Gene id`)
# Which one of those is significantly different?
rna_signdiff_fb <- rna_norm_fb_test2[which(rna_norm_fb_test2$Pvalue < 0.05),]
# Tag them in one
rna_norm_fb_test2$type <- ifelse(rna_norm_fb_test2$Pvalue < 0.05 & rna_norm_fb_test2$Log2FC < 0, 'Downregulated',
ifelse(rna_norm_fb_test2$Pvalue < 0.05 & rna_norm_fb_test2$Log2FC > 0, 'Upregulated', 'Not significant'))
# table(rna_norm_fb_test2$type)
# Put them colorrsss
rna_norm_fb_test2$colours <- ifelse(rna_norm_fb_test2$type == 'Downregulated', 'steelblue',
ifelse(rna_norm_fb_test2$type == 'Upregulated', 'tomato3', 'black'))
# Size of the points for the mean plot
rna_norm_fb_test2$cexs <- ifelse(rownames(rna_norm_fb_test2) %in% rownames(rna_signdiff_fb) , 1, 0.5)
plot(log2(rna_norm_fb_test2$`Mean CTRL`+0.5),
log2(rna_norm_fb_test2$`Mean HD`+0.5),
col=rna_norm_fb_test2$colours,
cex=rna_norm_fb_test2$cexs,
pch=16,
xlab='log2(mean Control)',
ylab='log2(mean HD)',
main='RNA FB - HD vs Control (p-value < 0.05; |log2FC| > 0)')
legend("bottomright", legend = c(paste("up (",as.numeric(table(rna_norm_fb_test2$type)["Upregulated"]),")",sep=""),
paste("down (",as.numeric(table(rna_norm_fb_test2$type)["Downregulated"]),")",sep = ""),
paste("not significant (",as.numeric(table(rna_norm_fb_test2$type)["Not significant"]),")",sep = "")),
pch=16,col=c("firebrick3","steelblue4","black"),cex=1)
Now we write the up/downregulated genes to excel files
# Significantly different
rna_norm_fb_test2_signdiff <- subset(rna_norm_fb_test2, rna_norm_fb_test2$type != 'Not significant')
# Up and down regulated for PANTHER
rna_norm_fb_test2_upreg <- merge(rna_norm_fb_test2_signdiff[which(rna_norm_fb_test2_signdiff$type == 'Upregulated'),], unique(gene_transcript[,c(2,3)]), by.x='row.names', by.y='gene_id')
# write.table(rna_norm_fb_test2_upreg$Gene, quote=F, col.names = F, row.names = F,
# file=paste(path, 'GO_analysis/fb_rna_hd.ctrl_upreg.tab', sep=''))
rna_norm_fb_test2_dwnreg <- merge(rna_norm_fb_test2_signdiff[which(rna_norm_fb_test2_signdiff$type == 'Downregulated'),], unique(gene_transcript[,c(2,3)]), by.x='row.names', by.y='gene_id')
# write.table(rna_norm_fb_test2_dwnreg$Gene, quote=F, col.names = F, row.names = F,
# file=paste(path, 'GO_analysis/fb_rna_hd.ctrl_dwnreg.tab', sep=''))
rna_norm_fb_test2 <- merge(rna_norm_fb_test2, unique(gene_transcript[,c('gene_id', 'gene_name')]), by.x='Gene id', by.y='gene_id')
# write.table(rna_norm_fb_test2$gene_name, quote=F, col.names = F, row.names = F,
# file=paste(path, 'GO_analysis/fb_rna_hd.ctrl.tab', sep=''))
Here we produce the PCA plot and normalized counts for iNs (highlighting HD vs control in PCA).
coldata_in <- subset(coldata, coldata$CellType == 'IN')
# DESeq DEA and normalizing with median of ratios
dds_in <- DESeqDataSetFromMatrix(rna[,coldata_in$Column], coldata_in, design= ~Stage)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds_in$Stage <- relevel(dds_in$Stage, 'CTRL')
dds_in <- DESeq(dds_in)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 141 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
res_in <- results(dds_in)
vst_in <- varianceStabilizingTransformation(dds_in)
plotPCA(vst_in, intgroup="Stage")
Here is a heatmap of authophagy related genes.
rna_norm_in <- as.data.frame(counts(dds_in, normalize=T))
rna_norm_in$gene_id <- rownames(rna_norm_in)
autophagy <- fread('/Volumes/My Passport/hd_in/autophagy_gene_list.txt', header = F, data.table = F)$V1
autophagy_rna_norm_in_control <- merge(rna_norm_in, unique(gene_transcript[,c(2,3)]), by='gene_id')
autophagy_rna_norm_in_control <- autophagy_rna_norm_in_control[which(autophagy_rna_norm_in_control$gene_name %in% autophagy), ]
rownames(autophagy_rna_norm_in_control) <- make.unique(autophagy_rna_norm_in_control$gene_name)
colnames(autophagy_rna_norm_in_control)[which(colnames(autophagy_rna_norm_in_control) %in% coldata$Sample)] <- coldata[colnames(autophagy_rna_norm_in_control)[which(colnames(autophagy_rna_norm_in_control) %in% coldata$Sample)], "RealName"]
autophagy_rna_norm_in_control_colannot <- coldata[which(coldata$CellType == 'IN' & coldata$Stage == "CTRL"), c("AgeContinuous", "RealName", "Column"), drop=F]
rownames(autophagy_rna_norm_in_control_colannot) <- autophagy_rna_norm_in_control_colannot$Column
autophagy_rna_norm_in_control <- autophagy_rna_norm_in_control[,subset(coldata, coldata$CellType == 'IN' & coldata$Stage == "CTRL")$Column]
pheatmap(log2(autophagy_rna_norm_in_control[which(rowSums(autophagy_rna_norm_in_control) > 0),]+0.5),
scale='row',
annotation_col = autophagy_rna_norm_in_control_colannot[,"AgeContinuous", drop=F],
fontsize = 7)
Now we test using iN’s normalized counts per gene/row between HD and control.
rna_norm_in_test2 <- apply(rna_norm_in, 1, FUN=t.test_row, coldata, "Stage", "CTRL", "HD")
rna_norm_in_test2 <- as.data.frame(t(rna_norm_in_test2))
rna_norm_in_test2$`Gene id` <- as.character(rna_norm_in_test2$`Gene id`)
rna_norm_in_test2$`Mean CTRL` <- as.numeric(as.character(rna_norm_in_test2$`Mean CTRL`))
rna_norm_in_test2$`Mean HD` <- as.numeric(as.character(rna_norm_in_test2$`Mean HD`))
rna_norm_in_test2$`Log2FC` <- as.numeric(as.character(rna_norm_in_test2$`Log2FC`))
rna_norm_in_test2$`Shapiro Pvalue - HD` <- as.numeric(as.character(rna_norm_in_test2$`Shapiro Pvalue - HD`))
rna_norm_in_test2$`Shapiro Pvalue - CTRL` <- as.numeric(as.character(rna_norm_in_test2$`Shapiro Pvalue - CTRL`))
rna_norm_in_test2$T <- as.numeric(as.character(rna_norm_in_test2$T))
rna_norm_in_test2$Pvalue <- as.numeric(as.character(rna_norm_in_test2$Pvalue))
rna_norm_in_test2$`Low conf int` <- as.numeric(as.character(rna_norm_in_test2$`Low conf int`))
rna_norm_in_test2$`High conf int` <- as.numeric(as.character(rna_norm_in_test2$`High conf int`))
# Filter out genes that are not normally distributed
rna_norm_in_test2 <- rna_norm_in_test2[which(rna_norm_in_test2$`Shapiro Pvalue - HD` > 0.05 & rna_norm_in_test2$`Shapiro Pvalue - CTRL` > 0.05),]
# Get the counts of the ones that are normally distributed
rna_norm_in <- subset(rna_norm_in, rna_norm_in$gene_id %in% rna_norm_in_test2$`Gene id`)
# Calculate the log2(mean per condition + 0.5)
# Which one of those is significantly different?
# Tag them in one
rna_norm_in_test2$type <- ifelse(rna_norm_in_test2$Pvalue < 0.05 & rna_norm_in_test2$Log2FC < 0, 'Downregulated',
ifelse(rna_norm_in_test2$Pvalue < 0.05 & rna_norm_in_test2$Log2FC > 0, 'Upregulated', 'Not significant'))
rna_norm_in_test2$colours <- ifelse(rna_norm_in_test2$type == 'Downregulated', 'steelblue',
ifelse(rna_norm_in_test2$type == 'Upregulated', 'tomato3', 'black'))
# Size of the points for the mean plot
title <- "iN RNA expression HD/Control"
subtitle <- "p-value < 0.05; Log2FC > 0"
rna_norm_in_test2$cexs <- ifelse(rna_norm_in_test2$Pvalue < 0.05 , 1, 0.5)
pdf("/Volumes/My Passport/hd_in/02.08.20_hd/plots/Fig2b.pdf")
plot(log2(rna_norm_in_test2$`Mean CTRL` + 0.5),
log2(rna_norm_in_test2$`Mean HD` + 0.5),
col=rna_norm_in_test2$colours,
cex=rna_norm_in_test2$cexs,
pch=16,
xlab='log2(mean Control)',
ylab='log2(mean HD)')
mtext(line=1, adj=0.5, cex=1, font=2, title)
mtext(line=0, adj=0.5, cex=0.8, subtitle)
legend("bottomright", legend = c(paste("Up (",as.numeric(table(rna_norm_in_test2$type)["Upregulated"]),")",sep=""),
paste("Down (",as.numeric(table(rna_norm_in_test2$type)["Downregulated"]),")",sep = ""),
paste("Not significant (",as.numeric(table(rna_norm_in_test2$type)["Not significant"]),")",sep = "")),
pch=16,col=c("firebrick3","steelblue4","black"),cex=1)
dev.off()
## quartz_off_screen
## 2
svg("/Volumes/My Passport/hd_in/02.08.20_hd/plots/Fig2b.svg")
plot(log2(rna_norm_in_test2$`Mean CTRL` + 0.5),
log2(rna_norm_in_test2$`Mean HD` + 0.5),
col=rna_norm_in_test2$colours,
cex=rna_norm_in_test2$cexs,
pch=16,
xlab='log2(mean Control)',
ylab='log2(mean HD)')
mtext(line=1, adj=0.5, cex=1, font=2, title)
mtext(line=0, adj=0.5, cex=0.8, subtitle)
legend("bottomright", legend = c(paste("Up (",as.numeric(table(rna_norm_in_test2$type)["Upregulated"]),")",sep=""),
paste("Down (",as.numeric(table(rna_norm_in_test2$type)["Downregulated"]),")",sep = ""),
paste("Not significant (",as.numeric(table(rna_norm_in_test2$type)["Not significant"]),")",sep = "")),
pch=16,col=c("firebrick3","steelblue4","black"),cex=1)
dev.off()
## quartz_off_screen
## 2
plot(log2(rna_norm_in_test2$`Mean CTRL` + 0.5),
log2(rna_norm_in_test2$`Mean HD` + 0.5),
col=rna_norm_in_test2$colours,
cex=rna_norm_in_test2$cexs,
pch=16,
xlab='log2(mean Control)',
ylab='log2(mean HD)')
mtext(line=1, adj=0.5, cex=1, font=2, title)
mtext(line=0, adj=0.5, cex=0.8, subtitle)
legend("bottomright", legend = c(paste("Up (",as.numeric(table(rna_norm_in_test2$type)["Upregulated"]),")",sep=""),
paste("Down (",as.numeric(table(rna_norm_in_test2$type)["Downregulated"]),")",sep = ""),
paste("Not significant (",as.numeric(table(rna_norm_in_test2$type)["Not significant"]),")",sep = "")),
pch=16,col=c("firebrick3","steelblue4","black"),cex=1)
Write to excels up/downregulated genes.
# Significantly different
rna_norm_in_test2_sign <- subset(rna_norm_in_test2, rna_norm_in_test2$type != 'Not significant')
# Up and down regulated for PANTHER
rna_norm_in_test2_upreg <- merge(rna_norm_in_test2[which(rna_norm_in_test2$type == 'Upregulated'),], unique(gene_transcript[,c(2,3)]), by.x='row.names', by.y='gene_id')
rna_norm_in_test2_dwnreg <- merge(rna_norm_in_test2[which(rna_norm_in_test2$type == 'Downregulated'),], unique(gene_transcript[,c(2,3)]), by.x='row.names', by.y='gene_id')
rna_norm_in_test2 <- merge(rna_norm_in_test2, unique(gene_transcript[,c('gene_id', 'gene_name')]), by.x='Gene id', by.y='gene_id')
to_file <- c("Gene", "Mean CTRL", "Mean HD", "Log2FC", "Shapiro Pvalue - CTRL", "Shapiro Pvalue - HD", "T", "Pvalue", "Low conf int", "High conf int", "type")
colnames(rna_norm_in_test2_upreg)[ncol(rna_norm_in_test2_upreg)] <- "Gene"
colnames(rna_norm_in_test2_dwnreg)[ncol(rna_norm_in_test2_dwnreg)] <- "Gene"
writexl::write_xlsx(x = rna_norm_in_test2_upreg[,to_file],
path = paste(path, "tables/STable5.xlsx", sep=''))
write.table(x = rna_norm_in_test2_upreg$Gene,
file = paste(path, "GO_analysis/in_rna_hd.ctrl_upreg.tab", sep=''), quote = F, row.names = F, col.names = F)
writexl::write_xlsx(x = rna_norm_in_test2_dwnreg[,to_file],
path = paste(path, "tables/STable6.xlsx", sep=''))
write.table(rna_norm_in_test2_dwnreg$Gene,
file = paste(path, "GO_analysis/in_rna_hd.ctrl_dwnreg.tab", sep=''), quote = F, row.names = F, col.names = F)
write.table(rna_norm_in_test2$gene_name,
file = paste(path, "GO_analysis/in_rna_hd.ctrl_expressed.tab", sep=''), quote = F, row.names = F, col.names = F)
Here is the head of the results of the upregulated genes in HD (iNs RNAseq).
head(rna_norm_in_test2_upreg)[,to_file]
## Gene Mean CTRL Mean HD Log2FC Shapiro Pvalue - CTRL
## 1 ABCC8 488.72647 1413.57884 1.5322532 0.4273369
## 2 GTF2IRD1 91.30833 123.63382 0.4372551 0.6065881
## 3 ARSD 44.73196 80.58777 0.8492549 0.3820303
## 4 PKD1 44.73004 79.14584 0.8232695 0.5098669
## 5 IFFO1 207.38291 342.02560 0.7218073 0.3536666
## 6 IDS 178.20264 312.82493 0.8118368 0.8691013
## Shapiro Pvalue - HD T Pvalue Low conf int High conf int
## 1 0.2335931 2.776919 0.02586789 146.2829371 1703.42181
## 2 0.4270095 2.756737 0.01748647 6.7558508 57.89513
## 3 0.1309651 2.660571 0.02808792 4.9269143 66.78470
## 4 0.8687674 2.205261 0.04892772 0.1918716 68.63973
## 5 0.8665694 2.306902 0.04006194 7.2192316 262.06614
## 6 0.1191613 2.344800 0.03728975 9.3669987 259.87758
## type
## 1 Upregulated
## 2 Upregulated
## 3 Upregulated
## 4 Upregulated
## 5 Upregulated
## 6 Upregulated
And here is the head of the results of the downregulated genes in HD (iNs RNAseq).
head(rna_norm_in_test2_dwnreg)[,to_file]
## Gene Mean CTRL Mean HD Log2FC Shapiro Pvalue - CTRL
## 1 PRKAR2B 26.36623 12.85520 -1.0363399 0.41802289
## 2 RPS20 1201.85090 693.75556 -0.7927586 0.91741495
## 3 UFL1 1063.95743 843.30861 -0.3353078 0.45025996
## 4 BIRC3 42.74032 20.32487 -1.0723514 0.61914994
## 5 EIPR1 730.14077 614.37986 -0.2490437 0.05751297
## 6 TMSB10 5460.05129 3258.60457 -0.7446602 0.92114047
## Shapiro Pvalue - HD T Pvalue Low conf int High conf int
## 1 0.1209983 -2.401368 0.036005831 -25.95750 -1.064575
## 2 0.5640452 -3.020351 0.013729030 -885.97604 -130.214642
## 3 0.4406767 -2.393680 0.036575038 -424.66449 -16.633138
## 4 0.9288888 -3.355552 0.006003475 -37.03118 -7.799721
## 5 0.5478951 -2.283536 0.041466974 -226.24574 -5.276084
## 6 0.1924914 -2.752384 0.018292121 -3954.73264 -448.160789
## type
## 1 Downregulated
## 2 Downregulated
## 3 Downregulated
## 4 Downregulated
## 5 Downregulated
## 6 Downregulated
Having checked the transcriptional changes between iNs and fibrooblasts, and the differences among the groups between HD and control, we now move on to check if the same differences are observed in the proteomics data.
protein <- fread('/Volumes/My Passport/hd_in/09.19_hd/5_proteomics/20200309_HDProteomics_Log2_SubtractMedianColumn_AverageSample_+20.txt', data.table=F)
samples_in_RNA <- colnames(protein)[which(colnames(protein) %in% coldata[which(coldata$Column %in% colnames(rna)[which(colnames(rna) %in% coldata$Column)]), "Sample"])]
protein <- protein[,c('Gene', samples_in_RNA)]
colnames(protein)[1] <- "gene_id"
protein$gene_unique <- make.unique(protein$gene_id)
colnames(protein)
## [1] "gene_id" "HD1_IN" "HD4_IN" "HD5_IN" "HD6_IN"
## [6] "HDKP_IN" "HDRH_IN" "HDSD_IN" "HD1_FB" "HD4_FB"
## [11] "HD5_FB" "HD6_FB" "HDKP_FB" "HDRH_FB" "HDSD_FB"
## [16] "C11_IN" "C12_IN" "CBS_IN" "CGE_IN" "CKK_IN"
## [21] "CKP_IN" "TS_IN" "C11_FB" "C12_FB" "CBS_FB"
## [26] "CGE_FB" "CKK_FB" "CKP_FB" "TS_FB" "gene_unique"
#
protein_fb <- protein[, c(colnames(protein)[which(colnames(protein) %in% subset(coldata, coldata$CellType == 'FB')$Sample)], 'gene_id', 'gene_unique')]
protein_in <- protein[, c(colnames(protein)[which(colnames(protein) %in% subset(coldata, coldata$CellType == 'IN')$Sample)], 'gene_id', 'gene_unique')]
protein_in[is.na(protein_in)] <- 0
protein_fb[is.na(protein_fb)] <- 0
protein[is.na(protein)] <- 0
coldata$Column <- coldata$Sample
rownames(coldata) <- coldata$Column
rownames(protein) <- protein$gene_unique
protein_test2 <- apply(protein, 1, FUN=t.test_row, coldata, "CellType", "IN", "FB")
protein_test2 <- as.data.frame(t(protein_test2))
protein_test2$`Gene id` <- as.character(protein_test2$`Gene id`)
protein_test2$`Mean IN` <- as.numeric(as.character(protein_test2$`Mean IN`))
protein_test2$`Mean FB` <- as.numeric(as.character(protein_test2$`Mean FB`))
protein_test2$`Log2FC` <- as.numeric(as.character(protein_test2$`Log2FC`))
protein_test2$`Shapiro Pvalue - IN` <- as.numeric(as.character(protein_test2$`Shapiro Pvalue - IN`))
protein_test2$`Shapiro Pvalue - FB` <- as.numeric(as.character(protein_test2$`Shapiro Pvalue - FB`))
protein_test2$T <- as.numeric(as.character(protein_test2$T))
protein_test2$Pvalue <- as.numeric(as.character(protein_test2$Pvalue))
protein_test2$`Low conf int` <- as.numeric(as.character(protein_test2$`Low conf int`))
protein_test2$`High conf int` <- as.numeric(as.character(protein_test2$`High conf int`))
# Filter out genes that are not normally distributed
protein_test2 <- protein_test2[which(protein_test2$`Shapiro Pvalue - IN` > 0.05 & protein_test2$`Shapiro Pvalue - FB` > 0.05),]
# Get the counts of the ones that are normally distributed
# protein <- subset(protein, protein$gene_name %in% protein_test2$`Gene name`)
# Which one of those is significantly different?
protein_signdiff <- protein_test2[which(protein_test2$Pvalue < 0.05),]
# Tag them in one
protein_test2$type <- ifelse(protein_test2$Pvalue < 0.05 & protein_test2$Log2FC < 0, 'Downregulated',
ifelse(protein_test2$Pvalue < 0.05 & protein_test2$Log2FC > 0, 'Upregulated', 'Not significant'))
# Put them colorrsss
protein_test2$colours <- ifelse(protein_test2$type == 'Downregulated', 'steelblue',
ifelse(protein_test2$type == 'Upregulated', 'tomato3', 'black'))
# Size of the points for the mean plot
protein_test2$cexs <- ifelse(rownames(protein_test2) %in% rownames(protein_signdiff) , 1, 0.5)
title <- "Protein abundance FB vs iN"
subtitle <- "p-value < 0.05; Log2FC > 0"
pdf("/Volumes/My Passport/hd_in/02.08.20_hd/plots/Fig1h.pdf")
plot(log2(protein_test2$`Mean IN`+0.5),
log2(protein_test2$`Mean FB`+0.5),
col=protein_test2$colours,
cex=0.5,
pch=16,
xlab='log2(mean iN)',
ylab='log2(mean FB)')
mtext(line=1, adj=0.5, cex=1, font=2, title)
mtext(line=0, adj=0.5, cex=0.8, subtitle)
legend("bottomright", legend = c(paste("up (",as.numeric(table(protein_test2$type)["Upregulated"]),")",sep=""),
paste("down (",as.numeric(table(protein_test2$type)["Downregulated"]),")",sep = ""),
paste("not significant (",as.numeric(table(protein_test2$type)["Not significant"]),")",sep = "")),
pch=16,col=c("firebrick3","steelblue4","black"),cex=1)
dev.off()
## quartz_off_screen
## 2
svg("/Volumes/My Passport/hd_in/02.08.20_hd/plots/Fig1h.svg")
plot(log2(protein_test2$`Mean IN`+0.5),
log2(protein_test2$`Mean FB`+0.5),
col=protein_test2$colours,
cex=0.5,
pch=16,
xlab='log2(mean iN)',
ylab='log2(mean FB)')
mtext(line=1, adj=0.5, cex=1, font=2, title)
mtext(line=0, adj=0.5, cex=0.8, subtitle)
legend("bottomright", legend = c(paste("up (",as.numeric(table(protein_test2$type)["Upregulated"]),")",sep=""),
paste("down (",as.numeric(table(protein_test2$type)["Downregulated"]),")",sep = ""),
paste("not significant (",as.numeric(table(protein_test2$type)["Not significant"]),")",sep = "")),
pch=16,col=c("firebrick3","steelblue4","black"),cex=1)
dev.off()
## quartz_off_screen
## 2
plot(log2(protein_test2$`Mean IN`+0.5),
log2(protein_test2$`Mean FB`+0.5),
col=protein_test2$colours,
cex=0.5,
pch=16,
xlab='log2(mean iN)',
ylab='log2(mean FB)')
mtext(line=1, adj=0.5, cex=1, font=2, title)
mtext(line=0, adj=0.5, cex=0.8, subtitle)
legend("bottomright", legend = c(paste("up (",as.numeric(table(protein_test2$type)["Upregulated"]),")",sep=""),
paste("down (",as.numeric(table(protein_test2$type)["Downregulated"]),")",sep = ""),
paste("not significant (",as.numeric(table(protein_test2$type)["Not significant"]),")",sep = "")),
pch=16,col=c("firebrick3","steelblue4","black"),cex=1)
Write o excels up/downregulated proteins.
# Significantly different
protein_test_sign <- subset(protein_test2, protein_test2$type != 'Not significant')
protein_test_upreg <- protein_test2[which(protein_test2$type == 'Upregulated'),]
protein_test_dwnreg <- protein_test2[which(protein_test2$type == 'Downregulated'),]
to_file <- c("Gene id", "Mean IN", "Mean FB", "Log2FC", "Shapiro Pvalue - IN",
"Shapiro Pvalue - FB", "T", "Pvalue", "Low conf int", "High conf int", "type")
protein_test_upreg_toxlsx <- protein_test_upreg
protein_test_dwnreg_toxlsx <- protein_test_dwnreg
writexl::write_xlsx(x = protein_test_upreg_toxlsx[,to_file],
path = paste(path, "tables/STable3.xlsx", sep=''))
writexl::write_xlsx(x = protein_test_dwnreg_toxlsx[,to_file],
path = paste(path, "tables/STable4.xlsx", sep=''))
Here is the head of the results of the upregulated proteins in FB (Proteomics).
head(protein_test_upreg_toxlsx)[,to_file]
## Gene id Mean IN Mean FB Log2FC Shapiro Pvalue - IN
## VIM VIM 31.67294 33.01509 0.059874894 0.9936218
## ACTG1 ACTG1 31.50417 32.26596 0.034470325 0.2950453
## TUBA1C TUBA1C 29.56989 29.81515 0.011917083 0.7155808
## EEF1A1 EEF1A1 29.44757 30.04114 0.028790701 0.3161377
## HSP90AB1 HSP90AB1 28.54917 28.72486 0.008851087 0.1410632
## ANXA2 ANXA2 28.84764 30.42195 0.076659220 0.9249483
## Shapiro Pvalue - FB T Pvalue Low conf int High conf int
## VIM 0.32118463 9.835915 4.202214e-08 1.05237318 1.6319312
## ACTG1 0.07587443 11.511948 2.162383e-11 0.62538781 0.8981999
## TUBA1C 0.43014051 5.444559 1.728470e-05 0.15191576 0.3386192
## EEF1A1 0.35596163 8.951051 4.198503e-08 0.45440087 0.7327278
## HSP90AB1 0.37396985 3.049715 5.499920e-03 0.05681223 0.2945689
## ANXA2 0.83820806 8.227045 4.884333e-07 1.16752584 1.9810897
## type
## VIM Upregulated
## ACTG1 Upregulated
## TUBA1C Upregulated
## EEF1A1 Upregulated
## HSP90AB1 Upregulated
## ANXA2 Upregulated
Here is the head of the results of the upregulated proteins in iNs (Proteomics).
head(protein_test_dwnreg_toxlsx)[,to_file]
## Gene id Mean IN Mean FB Log2FC Shapiro Pvalue - IN
## GAPDH GAPDH 29.93712 29.64836 -0.013983390 0.51529224
## LMNA LMNA 29.81328 29.67536 -0.006689492 0.09650573
## HIST1H4A HIST1H4A 29.74988 29.07642 -0.033034219 0.41919641
## HSP90B1 HSP90B1 29.72337 28.46676 -0.062319745 0.08843190
## HSPA5 HSPA5 29.53105 28.75311 -0.038514787 0.27184208
## HIST2H2BF HIST2H2BF 29.20504 28.54196 -0.033133076 0.92872778
## Shapiro Pvalue - FB T Pvalue Low conf int
## GAPDH 0.06069343 -3.548079 1.628956e-03 -0.4567060
## LMNA 0.07230358 -2.245891 3.344419e-02 -0.2641557
## HIST1H4A 0.71430467 -7.204543 3.871192e-07 -0.8676815
## HSP90B1 0.95784230 -16.035759 2.570715e-12 -1.4209120
## HSPA5 0.98602990 -12.588038 7.822722e-12 -0.9057475
## HIST2H2BF 0.43526771 -7.427167 2.040011e-07 -0.8482840
## High conf int type
## GAPDH -0.12082455 Downregulated
## LMNA -0.01168088 Downregulated
## HIST1H4A -0.47923947 Downregulated
## HSP90B1 -1.09232198 Downregulated
## HSPA5 -0.65013840 Downregulated
## HIST2H2BF -0.47788101 Downregulated
Similarly, now we test the differences between HD and control among the fibroblasts samples.
protein_fb_test2 <- apply(protein_fb, 1, FUN=t.test_row, coldata, "Stage", "CTRL", "HD")
protein_fb_test2 <- as.data.frame(t(protein_fb_test2))
protein_fb_test2$`Gene id` <- as.character(protein_fb_test2$`Gene id`)
protein_fb_test2$`Mean HD` <- as.numeric(as.character(protein_fb_test2$`Mean HD`))
protein_fb_test2$`Mean CTRL` <- as.numeric(as.character(protein_fb_test2$`Mean CTRL`))
protein_fb_test2$`Log2FC` <- as.numeric(as.character(protein_fb_test2$`Log2FC`))
protein_fb_test2$`Shapiro Pvalue - HD` <- as.numeric(as.character(protein_fb_test2$`Shapiro Pvalue - HD`))
protein_fb_test2$`Shapiro Pvalue - CTRL` <- as.numeric(as.character(protein_fb_test2$`Shapiro Pvalue - CTRL`))
protein_fb_test2$T <- as.numeric(as.character(protein_fb_test2$T))
protein_fb_test2$Pvalue <- as.numeric(as.character(protein_fb_test2$Pvalue))
protein_fb_test2$`Low conf int` <- as.numeric(as.character(protein_fb_test2$`Low conf int`))
protein_fb_test2$`High conf int` <- as.numeric(as.character(protein_fb_test2$`High conf int`))
# Filter out genes that are not normally distributed
protein_fb_test2 <- protein_fb_test2[which(protein_fb_test2$`Shapiro Pvalue - HD` > 0.05 & protein_fb_test2$`Shapiro Pvalue - CTRL` > 0.05),]
# Which one of those is significantly different?
# Tag them in one
protein_fb_test2$type <- ifelse(protein_fb_test2$Pvalue < 0.05 & protein_fb_test2$Log2FC < 0, 'Downregulated',
ifelse(protein_fb_test2$Pvalue < 0.05 & protein_fb_test2$Log2FC > 0, 'Upregulated', 'Not significant'))
# Put them colorrsss
protein_fb_test2$colours <- ifelse(protein_fb_test2$type == 'Downregulated', 'steelblue',
ifelse(protein_fb_test2$type == 'Upregulated', 'tomato3', 'black'))
# Size of the points for the mean plot
protein_fb_test2$cexs <- ifelse(protein_fb_test2$Pvalue < 0.05 , 1, 0.5)
protein_fb_test2$`log2 Mean CTRL` <- log2(protein_fb_test2$`Mean CTRL` + 0.5)
protein_fb_test2$`log2 Mean HD` <- log2(protein_fb_test2$`Mean HD` + 0.5)
plot(protein_fb_test2$`log2 Mean CTRL`,
protein_fb_test2$`log2 Mean HD`,
col=protein_fb_test2$colours,
cex=protein_fb_test2$cexs,
pch=16,
xlab='log2(mean Control)',
ylab='log2(mean HD)',
main='Protein FB - HD vs Control (p-value < 0.05; |log2FC| > 0)')
legend("bottomright", legend = c(paste("up (",as.numeric(table(protein_fb_test2$type)["Upregulated"]),")",sep=""),
paste("down (",as.numeric(table(protein_fb_test2$type)["Downregulated"]),")",sep = ""),
paste("not significant (",as.numeric(table(protein_fb_test2$type)["Not significant"]),")",sep = "")),
pch=16,col=c("firebrick3","steelblue4","black"),cex=1)
Write to excels up/downregulated proteins
# # Significantly different
# # Means
protein_fb_test_signdiff <- subset(protein_fb_test2, protein_fb_test2$type != 'Not significant')
# Up and down regulated for PANTHER
protein_fb_test_upreg <- protein_fb_test_signdiff[which(protein_fb_test_signdiff$type == 'Upregulated'),]
protein_fb_test_upreg <- merge(protein[,c('gene_id', 'gene_unique')], protein_fb_test_upreg, by.y='Gene id', by.x='gene_id')
# write.table(protein_fb_test_upreg$gene_id, quote=F, col.names = F, row.names = F,
# file=paste(path, 'GO_analysis/fb_protein_hd.ctrl_upreg.tab', sep=''))
protein_fb_test_dwnreg <- protein_fb_test_signdiff[which(protein_fb_test_signdiff$type == 'Downregulated'),]
protein_fb_test_dwnreg <- merge(protein[,c('gene_id', 'gene_unique')], protein_fb_test_dwnreg, by.y='Gene id', by.x='gene_unique')
# write.table(protein_fb_test_dwnreg$gene_id, quote=F, col.names = F, row.names = F,
# file=paste(path, 'GO_analysis/fb_protein_hd.ctrl_dwnreg.tab', sep=''))
#
# write.table(protein_fb_test2$`Gene id`, quote=F, col.names = F, row.names = F,
# file=paste(path, 'GO_analysis/fb_protein_hd.ctrl_expressed.tab', sep=''))
#
# write.xlsx(protein_fb_test_signdiff[,-c((ncol(protein_fb_test_signdiff)-3):ncol(protein_fb_test_signdiff))],
# file=paste(getwd(), '/5_proteomics/GO_analysis/fb_protein_hd.ctrl_signdiff.xlsx', sep=''), row.names = F)
Here is a heatmap of authophagy related proteins.
autophagy_protein_in_control <- protein_in[which(protein_in$gene_id %in% autophagy), ]
rownames(autophagy_protein_in_control) <- autophagy_protein_in_control$gene_unique
autophagy_protein_in_control <- autophagy_protein_in_control[,subset(coldata, coldata$CellType == 'IN' & coldata$Stage == "CTRL")$Sample]
autophagy_protein_in_control_colannot <- coldata[which(coldata$CellType == 'IN' & coldata$Stage == "CTRL"), c("AgeContinuous", "RealName"), drop=F]
rownames(autophagy_protein_in_control_colannot) <- autophagy_protein_in_control_colannot$RealName
colnames(autophagy_protein_in_control) <- coldata[colnames(autophagy_protein_in_control), "RealName"]
pheatmap(autophagy_protein_in_control,
scale='row',
annotation_col = autophagy_protein_in_control_colannot[,"AgeContinuous", drop=F], cluster_rows = T,
fontsize = 7)
We now test for differences between HD and control among the iN samples - T-tests of iNs, HD vs control - Mean plot
# Protein iN ----
protein_in_test2 <- apply(protein_in, 1, FUN=t.test_row, coldata, "Stage", "CTRL", "HD")
protein_in_test2 <- as.data.frame(t(protein_in_test2))
protein_in_test2$`Gene id` <- as.character(protein_in_test2$`Gene id`)
protein_in_test2$`Mean HD` <- as.numeric(as.character(protein_in_test2$`Mean HD`))
protein_in_test2$`Mean CTRL` <- as.numeric(as.character(protein_in_test2$`Mean CTRL`))
protein_in_test2$`Log2FC` <- as.numeric(as.character(protein_in_test2$`Log2FC`))
protein_in_test2$`Shapiro Pvalue - HD` <- as.numeric(as.character(protein_in_test2$`Shapiro Pvalue - HD`))
protein_in_test2$`Shapiro Pvalue - CTRL` <- as.numeric(as.character(protein_in_test2$`Shapiro Pvalue - CTRL`))
protein_in_test2$T <- as.numeric(as.character(protein_in_test2$T))
protein_in_test2$Pvalue <- as.numeric(as.character(protein_in_test2$Pvalue))
protein_in_test2$`Low conf int` <- as.numeric(as.character(protein_in_test2$`Low conf int`))
protein_in_test2$`High conf int` <- as.numeric(as.character(protein_in_test2$`High conf int`))
# Filter out genes that are not normally distributed
protein_in_test2 <- protein_in_test2[which(protein_in_test2$`Shapiro Pvalue - HD` > 0.05 & protein_in_test2$`Shapiro Pvalue - CTRL` > 0.05),]
# Which one of those is significantly different?
protein_signdiff_in <- protein_in_test2[which(protein_in_test2$Pvalue < 0.05 ),]
# Tag them in one
protein_in_test2$type <- ifelse(protein_in_test2$Pvalue < 0.05 & protein_in_test2$Log2FC < 0, 'Downregulated',
ifelse(protein_in_test2$Pvalue < 0.05 & protein_in_test2$Log2FC > 0, 'Upregulated', 'Not significant'))
# Put them colorrsss
protein_in_test2$colours <- ifelse(protein_in_test2$type == 'Downregulated', 'steelblue',
ifelse(protein_in_test2$type == 'Upregulated', 'tomato3', 'black'))
# Size of the points for the mean plot
protein_in_test2$cexs <- ifelse(protein_in_test2$type != "Not significant" , 1, 0.5)
title <- "Protein iN - HD vs Control"
subtitle <- "p-value < 0.05; Log2FC > 0"
pdf("/Volumes/My Passport/hd_in/02.08.20_hd/plots/Fig2c.pdf")
plot(log2(protein_in_test2$`Mean CTRL`),
log2(protein_in_test2$`Mean HD`),
col=protein_in_test2$colours,
cex=protein_in_test2$cexs,
pch=16,
xlab='log2(mean Control)',
ylab='log2(mean HD)',
ylim=c(3.7, 5), xlim=c(3.8, 5))#, ylim=c(14,30), xlim=c(14,30))
mtext(line=1, adj=0.5, cex=1, font=2, title)
mtext(line=0, adj=0.5, cex=0.8, subtitle)
legend("bottomright", legend = c(paste("Up (",as.numeric(table(protein_in_test2$type)["Upregulated"]),")",sep=""),
paste("Down (",as.numeric(table(protein_in_test2$type)["Downregulated"]),")",sep = ""),
paste("Not significant (",as.numeric(table(protein_in_test2$type)["Not significant"]),")",sep = "")),
pch=16,col=c("firebrick3","steelblue4","black"),cex=1)
dev.off()
## quartz_off_screen
## 2
svg("/Volumes/My Passport/hd_in/02.08.20_hd/plots/Fig2c.svg")
plot(log2(protein_in_test2$`Mean CTRL`),
log2(protein_in_test2$`Mean HD`),
col=protein_in_test2$colours,
cex=protein_in_test2$cexs,
pch=16,
xlab='log2(mean Control)',
ylab='log2(mean HD)',
ylim=c(3.7, 5), xlim=c(3.8, 5))#, ylim=c(14,30), xlim=c(14,30))
mtext(line=1, adj=0.5, cex=1, font=2, title)
mtext(line=0, adj=0.5, cex=0.8, subtitle)
legend("bottomright", legend = c(paste("Up (",as.numeric(table(protein_in_test2$type)["Upregulated"]),")",sep=""),
paste("Down (",as.numeric(table(protein_in_test2$type)["Downregulated"]),")",sep = ""),
paste("Not significant (",as.numeric(table(protein_in_test2$type)["Not significant"]),")",sep = "")),
pch=16,col=c("firebrick3","steelblue4","black"),cex=1)
dev.off()
## quartz_off_screen
## 2
plot(log2(protein_in_test2$`Mean CTRL`),
log2(protein_in_test2$`Mean HD`),
col=protein_in_test2$colours,
cex=protein_in_test2$cexs,
pch=16,
xlab='log2(mean Control)',
ylab='log2(mean HD)',
ylim=c(3.7, 5), xlim=c(3.8, 5))#, ylim=c(14,30), xlim=c(14,30))
mtext(line=1, adj=0.5, cex=1, font=2, title)
mtext(line=0, adj=0.5, cex=0.8, subtitle)
legend("bottomright", legend = c(paste("Up (",as.numeric(table(protein_in_test2$type)["Upregulated"]),")",sep=""),
paste("Down (",as.numeric(table(protein_in_test2$type)["Downregulated"]),")",sep = ""),
paste("Not significant (",as.numeric(table(protein_in_test2$type)["Not significant"]),")",sep = "")),
pch=16,col=c("firebrick3","steelblue4","black"),cex=1)
Write to excels up/downregulated proteins
protein_in_test_upreg <- protein_in_test2[which(protein_in_test2$type == 'Upregulated'),]
protein_in_test_upreg <- merge(protein[,c('gene_id', 'gene_unique')], protein_in_test_upreg, by.y='Gene id', by.x='gene_unique')
protein_in_test_dwnreg <- protein_in_test2[which(protein_in_test2$type == 'Downregulated'),]
protein_in_test_dwnreg <- merge(protein[,c('gene_id', 'gene_unique')], protein_in_test_dwnreg, by.y='Gene id', by.x='gene_unique')
protein_in_test2 <- merge(protein[,c('gene_id', 'gene_unique')], protein_in_test2, by.y='Gene id', by.x='gene_unique')
colnames(protein_in_test_upreg)[2] <- "Gene id"
colnames(protein_in_test_dwnreg)[2] <- "Gene id"
to_file <- c("Gene id", "Mean CTRL", "Mean HD", "Log2FC", "Shapiro Pvalue - CTRL", "Shapiro Pvalue - HD", "T", "Pvalue", "Low conf int", "High conf int", "type")
writexl::write_xlsx(x = protein_in_test_upreg[,to_file],
path = paste(path, "tables/STable9.xlsx", sep=''))
writexl::write_xlsx(x = protein_in_test_dwnreg[,to_file],
path = paste(path, "tables/STable10.xlsx", sep=''))
Here is the head of the results of the upregulated proteins in HD (iN Proteomics).
head(protein_in_test_upreg)[,to_file]
## Gene id Mean CTRL Mean HD Log2FC Shapiro Pvalue - CTRL
## 1 AACS 21.20636 21.52696 0.02164786 0.49752565
## 2 ABCB8 19.87560 20.27344 0.02859221 0.07826388
## 3 ABTB2 15.43905 16.50342 0.09618117 0.15973093
## 4 ACAA2 25.12628 25.64100 0.02925526 0.06554315
## 5 ACTN2 16.56340 16.96443 0.03451404 0.26420543
## 6 ACTR1B 19.51741 19.89266 0.02747468 0.44729685
## Shapiro Pvalue - HD T Pvalue Low conf int High conf int
## 1 0.43857957 3.284993 0.006981241 0.10661784 0.5345902
## 2 0.90270828 2.785704 0.027502665 0.05906827 0.7366037
## 3 0.91918465 3.506902 0.007176930 0.37250484 1.7562380
## 4 0.09021916 2.333240 0.038194573 0.03312353 0.9963102
## 5 0.34861162 2.695730 0.025791653 0.06152618 0.7405324
## 6 0.72843300 2.583030 0.027986011 0.05001310 0.7004895
## type
## 1 Upregulated
## 2 Upregulated
## 3 Upregulated
## 4 Upregulated
## 5 Upregulated
## 6 Upregulated
Here is the head of the results of the downregulated proteins in HD (iN Proteomics).
head(protein_in_test_dwnreg)[,to_file]
## Gene id Mean CTRL Mean HD Log2FC Shapiro Pvalue - CTRL
## 1 ABRAXAS2 20.18307 19.87083 -0.022493367 0.9209567
## 2 ACE 18.55312 17.45660 -0.087889355 0.3405611
## 3 ACSL4 21.52899 21.24862 -0.018911441 0.7493673
## 4 ACTBL2 20.24593 19.88613 -0.025869173 0.9210712
## 5 ACYP1 20.65147 20.41381 -0.016699240 0.8644375
## 6 ALG5 23.36715 23.22187 -0.008997331 0.1827212
## Shapiro Pvalue - HD T Pvalue Low conf int High conf int
## 1 0.6982527 -2.369834 0.036583812 -0.6012611 -0.023215222
## 2 0.6152739 -3.008697 0.013100266 -1.9082042 -0.284839539
## 3 0.9200674 -2.470364 0.030481170 -0.5292058 -0.031532768
## 4 0.1026338 -2.288133 0.047256325 -0.7142159 -0.005378699
## 5 0.1764699 -2.465264 0.032162895 -0.4508711 -0.024455483
## 6 0.7863622 -4.624208 0.001669147 -0.2176314 -0.072918892
## type
## 1 Downregulated
## 2 Downregulated
## 3 Downregulated
## 4 Downregulated
## 5 Downregulated
## 6 Downregulated
We have came up with the following groups of genes/proteins from the tests between HD vs control - Upregulated in HD (proteomics of fibroblasts) –> Protein.Upreg.FB - Upregulated in HD (proteomics of iNs) –> Protein.Upreg.iN - Upregulated in HD (RNAseq of fibroblasts) –> RNA.Upreg.FB - Upregulated in HD (RNAseq of iNs) –> RNA.Upreg.iN - Downregulated in HD (proteomics of fibroblasts) –> Protein.Downreg.FB - Downregulated in HD (proteomics of iNs) –> Protein.Downreg.iN - Downregulated in HD (RNAseq of fibroblasts) –> RNA.Downreg.FB - Downregulated in HD (RNAseq of iNs) –> RNA.Downreg.iN
To visualize how many genes/proteins are being shared between the groups, here is an upset plot:
to_file <- c("Gene", "Mean CTRL", "Mean HD", "Log2FC", "Shapiro Pvalue - CTRL",
"Shapiro Pvalue - HD", "T", "Pvalue", "Low conf int", "High conf int", "type")
colnames(rna_norm_in_test2_upreg)[ncol(rna_norm_in_test2_upreg)] <- "Gene"
colnames(rna_norm_in_test2_upreg)[1] <- "gene_id"
colnames(rna_norm_in_test2_dwnreg)[ncol(rna_norm_in_test2_dwnreg)] <- "Gene"
colnames(rna_norm_in_test2_dwnreg)[1] <- "gene_id"
colnames(rna_norm_fb_test2_upreg)[ncol(rna_norm_fb_test2_upreg)] <- "Gene"
colnames(rna_norm_fb_test2_upreg)[1] <- "gene_id"
colnames(rna_norm_fb_test2_dwnreg)[ncol(rna_norm_fb_test2_dwnreg)] <- "Gene"
colnames(rna_norm_fb_test2_dwnreg)[1] <- "gene_id"
# UPSET
input <- data.frame(gene_name=unique(c(rna_norm_in_test2_upreg$Gene,
rna_norm_fb_test2_upreg$Gene,
protein_in_test_upreg$`Gene id`,
protein_fb_test_upreg$gene_id,
rna_norm_in_test2_dwnreg$Gene,
rna_norm_fb_test2_dwnreg$Gene,
protein_in_test_dwnreg$`Gene id`,
protein_fb_test_dwnreg$gene_id)))
input$RNA.Upreg.iN <- ifelse(input$gene_name %in% rna_norm_in_test2_upreg$Gene, 1, 0)
input$RNA.Upreg.FB <- ifelse(input$gene_name %in% rna_norm_fb_test2_upreg$Gene, 1, 0)
input$Protein.Upreg.iN <- ifelse(input$gene_name %in% protein_in_test_upreg$`Gene id`, 1, 0)
input$Protein.Upreg.FB <- ifelse(input$gene_name %in% protein_fb_test_upreg$gene_id, 1, 0)
input$RNA.Downreg.iN <- ifelse(input$gene_name %in% rna_norm_in_test2_dwnreg$Gene, 1, 0)
input$RNA.Downreg.FB <- ifelse(input$gene_name %in% rna_norm_fb_test2_dwnreg$Gene, 1, 0)
input$Protein.Downreg.iN <- ifelse(input$gene_name %in% protein_in_test_dwnreg$`Gene id`, 1, 0)
input$Protein.Downreg.FB <- ifelse(input$gene_name %in% protein_fb_test_dwnreg$gene_id, 1, 0)
library(UpSetR)
##
## Attaching package: 'UpSetR'
## The following object is masked from 'package:lattice':
##
## histogram
upset_plot <- upset(input, sets = c("RNA.Downreg.iN", # #417096
"RNA.Downreg.FB", # #417096
"Protein.Downreg.iN", # #9dc3e3
"Protein.Downreg.FB", # #9dc3e3
"RNA.Upreg.iN", # #d15685
"RNA.Upreg.FB", # #d15685
"Protein.Upreg.iN", # #f2a7c4
"Protein.Upreg.FB"), # #f2a7c4
mainbar.y.label = "Gene Intersections", sets.x.label = "Genes Per Group",
keep.order=TRUE,
sets.bar.color=c('#417096', '#417096', '#9dc3e3', '#9dc3e3', '#d15685', '#d15685', '#f2a7c4', '#f2a7c4'))
pdf("/Volumes/My Passport/hd_in/02.08.20_hd/plots/Fig2d.pdf")
upset_plot
dev.off()
## quartz_off_screen
## 2
svg("/Volumes/My Passport/hd_in/02.08.20_hd/plots/Fig2d.svg")
upset_plot
dev.off()
## quartz_off_screen
## 2
upset_plot
writexl::write_xlsx(x = input,
path = paste(path, "tables/STable11.xlsx", sep=''))
As you can see from the plot, there is very little intersection among the groups of dysregulated genes.
Here we just convert the results from the GO analysis (PANTHER version 16) to excel files. The GO analysis was performed with the up and downregulated genes in iN RNAseq HD vs Control.
rna_norm_in_test2_upreg_GO <- fread(paste(path, "GO_analysis/STable7.txt", sep=""), fill=T, skip = 11, sep="\t")
rna_norm_in_test2_upreg_GO <- rna_norm_in_test2_upreg_GO[order(rna_norm_in_test2_upreg_GO$`in_rna_hd.ctrl_upreg.tab (FDR)`),]
writexl::write_xlsx(rna_norm_in_test2_upreg_GO, paste(path, "tables/STable7.xlsx", sep=""), col_names = T)
rna_norm_in_test2_dwnreg_GO <- fread( paste(path, "GO_analysis/STable8.txt", sep=""), fill=T, skip = 11, sep="\t")
rna_norm_in_test2_dwnreg_GO <- rna_norm_in_test2_dwnreg_GO[order(rna_norm_in_test2_dwnreg_GO$`in_rna_hd.ctrl_dwnreg.tab (FDR)`),]
writexl::write_xlsx(rna_norm_in_test2_dwnreg_GO, paste(path,"tables/STable8.xlsx", sep=""), col_names = T)
Gene set enrichment analysis (iNs RNAseq) shows a positive enrichment to ZNF23 and ZNF16 target genes.
library(clusterProfiler)
##
## clusterProfiler v4.0.2 For help: https://yulab-smu.top/biomedical-knowledge-mining-book/
##
## If you use clusterProfiler in published research, please cite:
## T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal enrichment tool for interpreting omics data. The Innovation. 2021, doi: 10.1016/j.xinn.2021.100141
##
## Attaching package: 'clusterProfiler'
## The following object is masked from 'package:lattice':
##
## dotplot
## The following object is masked from 'package:IRanges':
##
## slice
## The following object is masked from 'package:S4Vectors':
##
## rename
## The following object is masked from 'package:stats':
##
## filter
library(enrichplot)
##
## Attaching package: 'enrichplot'
## The following object is masked from 'package:lattice':
##
## dotplot
## The following object is masked from 'package:ggpubr':
##
## color_palette
library(msigdbr)
set.seed(10)
# GSEA ----
m_tfs <- msigdbr(species = "Homo sapiens", category = "C3") %>%
dplyr::select(gs_name, human_gene_symbol)#, human_gene_symbol)
## feature 1: numeric vector
geneList <- rna_norm_in_test2$Log2FC
## feature 2: named vector
names(geneList) <- as.character(rna_norm_in_test2$gene_name)
## feature 3: decreasing order
geneList <- sort(geneList, decreasing = TRUE)
em2 <- GSEA(geneList, TERM2GENE = m_tfs, seed = T)
## preparing geneSet collections...
## GSEA analysis...
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
## gseaParam, : There are duplicate gene names, fgsea may produce unexpected
## results.
## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-10. You can set the `eps` argument to zero for better estimation.
## leading edge analysis...
## done...
gseaplot2(em2, geneSetID = c("ZNF23_TARGET_GENES", "ZNF16_TARGET_GENES"),
title = "Enrichment of ZNF23 and ZNF16 target genes")
Number of positively enriched gene sets
# Positively enriched
nrow(em2@result[which(em2@result$enrichmentScore > 0),])
## [1] 2
Number of negatively enriched gene sets
# Negatively enriched
nrow(em2@result[which(em2@result$enrichmentScore < 0),])
## [1] 116
There are the negatively enriched gene sets which are not miRNAs (as we obviously won’t see those in the proteomics..)
# Negatively enriched not miRNAs
em2@result[which(em2@result$enrichmentScore < 0 & !grepl("MIR", em2@result$ID)),]
## ID Description setSize
## DLX4_TARGET_GENES DLX4_TARGET_GENES DLX4_TARGET_GENES 449
## ZZZ3_TARGET_GENES ZZZ3_TARGET_GENES ZZZ3_TARGET_GENES 68
## TAF9B_TARGET_GENES TAF9B_TARGET_GENES TAF9B_TARGET_GENES 289
## HMGB2_TARGET_GENES HMGB2_TARGET_GENES HMGB2_TARGET_GENES 203
## OCT1_04 OCT1_04 OCT1_04 126
## GATA6_01 GATA6_01 GATA6_01 129
## enrichmentScore NES pvalue p.adjust
## DLX4_TARGET_GENES -0.3333336 -1.858087 1.309810e-10 1.463494e-07
## ZZZ3_TARGET_GENES -0.4421819 -1.871199 1.437797e-04 1.175487e-02
## TAF9B_TARGET_GENES -0.2777385 -1.484187 4.998478e-04 2.204592e-02
## HMGB2_TARGET_GENES -0.3021709 -1.528563 7.425028e-04 2.884510e-02
## OCT1_04 -0.3495798 -1.646258 1.035975e-03 3.445199e-02
## GATA6_01 -0.3322013 -1.567699 1.352440e-03 4.121254e-02
## qvalues rank leading_edge
## DLX4_TARGET_GENES 1.326814e-07 4000 tags=42%, list=28%, signal=31%
## ZZZ3_TARGET_GENES 1.065705e-02 3153 tags=41%, list=22%, signal=32%
## TAF9B_TARGET_GENES 1.998699e-02 4070 tags=41%, list=29%, signal=30%
## HMGB2_TARGET_GENES 2.615118e-02 3509 tags=35%, list=25%, signal=27%
## OCT1_04 3.123442e-02 2823 tags=31%, list=20%, signal=25%
## GATA6_01 3.736359e-02 3053 tags=33%, list=22%, signal=26%
## core_enrichment
## DLX4_TARGET_GENES ZNF217/CDC14B/TARS2/COPS7A/CSNK1G1/SUPT7L/OS9/MTMR9/HAUS6/ZRANB2-AS2/VPS33B-DT/CNOT1/AC012184.3/TRUB2/SLC50A1/C12orf76/RNF167/ZNF169/GPN3/DPAGT1/CWC25/ATG3/ZDHHC6/GTPBP8/TDG/MPHOSPH10/ANKIB1/AP3S2/RPS9/RHOC/CCDC191/WDPCP/CDK12/RPLP1/GBA/PPWD1/TLE4/FANCD2/EMC4/EED/CSPP1/TRMO/AAAS/NUP155/ERAL1/COG5/GPNMB/DZANK1/LRRC57/TBL1XR1/GORAB/SELENOI/ZBTB37/CDK5RAP1/THADA/AAR2/C12orf73/MRPS31P5/BPGM/PANK3/TOB1/DDX21/PAN2/MCCC1/SELENOP/SYF2/PURB/KNTC1/FKBP15/RACK1/DTWD1/MCEE/GFM2/PPIL3/TUBD1/RBFOX2/NUP205/SEC61B/CD70/RBBP5/CCDC77/ZNF830/LCMT1-AS1/RSRC2/LSM8/EMG1/WDR92/UTP3/HNRNPH1/DDX23/DNAJC19/RPP14/MTFR2/VPS4B/SCRN3/CMSS1/LAPTM4A/C19orf53/MDH1B/PIH1D2/FKBP3/POC5/RAB3GAP1/TIGD4/NME7/NAGK/CDKAL1/FAU/HINT1/NKAPD1/LGALSL/ATG4C/EIF1AD/TRMT12/BLOC1S1/SUGT1/UBN2/GAS5/GTF2E1/FUBP1/PFKM/POLR3F/HTD2/SF3B5/TOX4/POLQ/PPP1R11/KCTD16/SELENOF/WDR82/XPO1/TIMM8B/TTC26/KRR1/EMC3/CLCN3/ZW10/QTRT2/ALKBH2/HIBCH/DNAJB4/MMACHC/NDUFAF2/HPS5/LRRC40/RRP15/DPH3/PHF5A/LAMA3/SNRNP27/NUCKS1/FGF7/ARFIP1/FAM227B/MAP3K13/ITGB3BP/ZMAT2/CHP1/RPL14/WDR53/PAIP2B/NSA2/RPL9/ZNF19/HAX1/NXT2/LIN7C/C9orf43/SERPINI1/MRPL13/LNP1/PUS10/RPL23/ELOC/TMEM267/PFDN6/RBM15-AS1/ATP5MF/GEMIN2/DDIAS/TOMM7/GORAB-AS1/SEM1/MACC1/HNMT/COX7A2/CCDC163/PET100/GCNT3
## ZZZ3_TARGET_GENES TMEM18/POLH-AS1/NUP155/TK1/SNHG25/WDR76/SAT1/GFM2/SLC25A44/RPS23/ZMYM3/WDR92/RAB18/RAB5B/RPS3A/TOX4/CSTF2T/EMC3/ESF1/ITGB3BP/C21orf62-AS1/NSA2/RPS18/RPS20/MECOM/ACYP1/ZBTB20-AS4/KCNJ5
## TAF9B_TARGET_GENES SNHG15/INTS6-AS1/TFRC/ZNF518A/RPL28/SUGCT/PUM1/SUPT7L/NDUFA7/SNHG1/LINC01088/CCNC/PTMA/ATRX/AGPAT4/NDUFC1/ZNF257/HAUS8/TRIP12/CNPY4/MRPL20/RICTOR/AP3S2/NET1/CDK12/NSL1/COG4/ID2-AS1/ELP4/RPL3/NDUFC2/POLR2C/RPRD2/RPL13A/RPL29/ZNF343/ZBTB37/AP002026.1/TM9SF4/AAR2/AFF1/MRPS31P5/MRPL39/SMIM27/BEX4/NMT1/THOC2/GNL3/GOSR1/S100A2/KNTC1/MAF/BMS1/RACK1/ELOB/RBM34/SAT1/PTEN/TM2D1/ODC1/RPS28/RSRC2/FAM133B/EEF1A1/CD63/MCPH1-AS1/IQCG/RCOR1/EIF4B/TOMM22/SNHG6/NAGK/EIF1AD/ATF1/GAS5/ZC3H6/SNHG5/ANP32E/AC009779.2/SELENOF/RPS6/KRR1/CLCN3/RPL12/RPL7A/DNAJB4/LIN9/RBM27/SPTA1/STMP1/SNRNP27/FXR1/STYXL1/ZMAT2/RPL7/RPL14/RPL26/C21orf62-AS1/FAM162A/SEC62/RPL35/CYCS/MRPL42/RPL11/CHM/MRPL13/LINC01275/RPL31/RBM15-AS1/RPS4X/TXN/ATP5MF/LINC01270/DDIT3/PFDN4/RPL39/RPS29/COX7A2
## HMGB2_TARGET_GENES ADK/TSN/COG4/BAG6/PPWD1/KMT2A/TPD52/ANXA2/TTC8/CIAO2A/DZANK1/TBL1XR1/GORAB/DDX39B/BMPR1B/LINC02458/LZTFL1/RABGGTB/RN7SKP11/SUCLG1/IRGQ/NUP85/DNAJC24/EMSY/ELOVL6/RPL23AP82/NUP205/SEC61B/RBBP5/KLHL4/FARS2/RPS23/EPCAM-DT/SPAG7/HCFC1R1/MRPS7/WDR92/ZNF620/IQCG/UPF2/G2E3/XPNPEP3/NME7/RWDD1/RNF182/UBE2E1/RGS5/VPS50/CA13/POLR3F/ANP32E/LINC00652/PPP1R11/RPL23AP7/SELENOF/NR3C2/C19orf38/EMC3/SNRPN/LRRC40/ZNF510/NUCKS1/MAP3K13/RPS12/RPL9/MT-RNR1/RSL24D1/COX7A2L/RGS4/GORAB-AS1/MIR100HG/ERG
## OCT1_04 ETV1/CHD6/CNMD/TIAF1/LDB2/PTEN/SKIDA1/GABRG2/DLG2/BCL11B/HOXC6/CASK/C8orf82/FGF13/SP6/ELAVL4/RUNX1/ISL1/SLC38A4/PDE1A/PPP3CB/LMO3/NR3C2/GALNT1/AGBL4/EYA1/TP53INP2/JAZF1/CHM/SERTAD4/ZFPM2/ARHGAP30/HOXC4/GPM6A/HOXB3/CNTN6/C8orf34/GRIK3/PAX6
## GATA6_01 CIAO2A/ECT2/BPGM/PUM2/DDR2/HNRNPA2B1/ESRRG/NR5A2/CARD6/EDA/ATP6V0A4/SALL2/GATA6/SMAD5/SKIDA1/POU2AF1/MATN1/PLAC1/LSAMP/ZNF781/LRCH2/ELAVL4/PSMA1/ISL1/HS3ST5/PPARGC1A/ENO2/HOXB6/LMO3/KCNH5/EYA1/ITGB3BP/ZFPM2/HOXC4/BCO1/HOXB3/CFAP161/HOXA4/SSTR3/ERG/DCX/KLB
Among those, just TAF9 and HMGB2 were captured by the proteomics, so here are the boxplots of those two proteins in the iN’s proetomics
negatively_enriched <- sapply(str_split(em2@result[which(em2@result$enrichmentScore < 0 & !grepl("MIR", em2@result$ID)), "ID"], "_"), `[[`, 1)
negatively_enriched_counts <- protein_in[which(protein_in$gene_id %in% c("TAF9", "HMGB2")),]
rownames(negatively_enriched_counts) <- negatively_enriched_counts$gene_id
negatively_enriched_counts <- negatively_enriched_counts[,c(-ncol(negatively_enriched_counts))]
negatively_enriched_counts_melt <- reshape2::melt(negatively_enriched_counts, by='gene_id')
## Using gene_id as id variables
negatively_enriched_counts_melt <- merge(negatively_enriched_counts_melt, coldata_in, by.x = "variable", by.y='Sample')
ggplot(negatively_enriched_counts_melt, aes(x=Stage, y=value, fill=Stage)) + geom_boxplot() + facet_wrap(.~gene_id, scales = "free")+ theme_classic() +
labs(x="Condition", fill = "Condition", y="Protein levels") + ggtitle("Protein (iN) abundance of the negatively enriched gene sets in RNAseq (iN)")
Here we checked if we could see a trend in the log2FCs (iNs proteomics results - HD vs control) of the target genes, of the gene sets that we saw enriched in the iN HD vs control RNAseq results (positively or negatively).
genes_genesets_upreg <- unlist(str_split(paste(em2@result[which(em2@result$enrichmentScore > 0),"core_enrichment"], collapse = "/"), "/"))
positively_enriched_targets_counts <- protein_in_test2[which(protein_in_test2$gene_id %in% genes_genesets_upreg),]
genes_genesets_dwnreg <- unlist(str_split(paste(em2@result[which(em2@result$enrichmentScore < 0),"core_enrichment"], collapse = "/"), "/"))
negatively_enriched_targets_test <- protein_in_test2[which(protein_in_test2$gene_id %in% genes_genesets_dwnreg),]
# How many of these targets were captured in our proteomics?
table(genes_genesets_upreg %in% protein$gene_id)
##
## FALSE TRUE
## 21 13
table(genes_genesets_dwnreg %in% protein$gene_id)
##
## FALSE TRUE
## 4990 4074
library(ggpubr)
ggarrange(
ggplot(positively_enriched_targets_counts, aes(x=1, y=Log2FC)) +
geom_boxplot() + geom_hline(yintercept = 0, linetype = "dashed", colour="red") +
theme_classic() + theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank()) + ggtitle("Log2FC of protein targets of enriched TF among\nupregulated genes in RNAseq"),
ggplot(negatively_enriched_targets_test, aes(x=1, y=Log2FC)) +
geom_boxplot() + geom_hline(yintercept = 0, linetype = "dashed", colour="red") +
theme_classic() + theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank()) + ggtitle("Log2FC of protein targets of enriched TF among\ndownregulated genes in RNAseq"))
To rule out hetereogeneity of the RNAseq samples causing lower statistical power and consequently not finding these genes/proteins among the dysregulated genes in the RNAseq data, we checked the RNA expression of the top 10 up and downregulated proteins in iNs.
# Heterogeneity of samples? ----
# this changed bc of log2FC calculation... my bad
top10_upreg_protein_in <- head(protein_in_test_upreg[order(protein_in_test_upreg$Log2FC, decreasing = T),], 10)$Gene
top10_dwnreg_protein_in <- head(protein_in_test_dwnreg[order(protein_in_test_dwnreg$Log2FC, decreasing = T),], 10)$Gene
coldata <- fread('/Volumes/My Passport/hd_in/09.19_hd/CategoricalAnnotation.txt', data.table = F)
rownames(coldata) <- coldata$Column
coldata$Sample <- as.character(coldata$Sample)
coldata$Column <- as.character(coldata$Column)
rna <- fread('/Volumes/My Passport/hd_in/02.08.20_hd/3_stdmapping/1_readcounts/gene_count_matrix_0_p.csv', data.table=F)
rownames(rna) <- rna$Geneid
colnames(rna)[7:ncol(rna)] <- sapply(str_split(colnames(rna)[7:ncol(rna)], "/"), `[[`, 5)
rna_expressed <- rna[which(apply(rna[,coldata$Column], 1, more_10) > 0), coldata$Column]
dds_rna <- DESeqDataSetFromMatrix(rna_expressed[,coldata$Column], coldata, design= ~Stage)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds_rna$Stage <- relevel(dds_rna$Stage, 'CTRL')
dds_rna <- DESeq(dds_rna)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 528 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
rna_expressed_norm <- as.data.frame(counts(dds_rna, normalize=T))
rna_expressed_norm <- merge(rna_expressed_norm, unique(gene_transcript[,c("gene_id", "gene_name")]), by.x="row.names", by.y="gene_id")
top10_upreg_protein_in_df <- rna_expressed_norm[which(rna_expressed_norm$gene_name %in% top10_upreg_protein_in), ]
top10_dwnreg_protein_in_df <- rna_expressed_norm[which(rna_expressed_norm$gene_name %in% top10_dwnreg_protein_in), ]
top10_upreg_protein_in_df <- reshape2::melt(top10_upreg_protein_in_df[,-1], by="gene_name")
## Using gene_name as id variables
top10_dwnreg_protein_in_df <- reshape2::melt(top10_dwnreg_protein_in_df[,-1], by="gene_name")
## Using gene_name as id variables
top10_upreg_protein_in_df <- merge(top10_upreg_protein_in_df, coldata_in, by.x='variable', by.y="Column")
top10_dwnreg_protein_in_df <- merge(top10_dwnreg_protein_in_df, coldata_in, by.x='variable', by.y="Column")
ggplot(top10_upreg_protein_in_df, aes(x=Stage, y=value, fill=Stage)) + geom_boxplot() +
facet_wrap(.~gene_name, scales = "free_y") + theme_classic() +
ggtitle("Top 10 upregulated proteins in RNAseq (iNs)") +
labs(y="Normalized expression (median of ratios)", fill="Condition", x="Condition")
ggplot(top10_dwnreg_protein_in_df, aes(x=Stage, y=value, fill=Stage)) + geom_boxplot() +
facet_wrap(.~gene_name, scales = "free_y") + theme_classic() +
ggtitle("Top 10 downregulated proteins in RNAseq (iNs)") +
labs(y="Normalized expression (median of ratios)", fill="Condition", x="Condition")
To answer the question to if iNs resembled more to cortical or striatal neurons we plot, both in RNA expression and protein abundance, markers of those two regions (taken from Linnarsson Mouse Atlas).
# Cortical vs striatal ----
striatal <- toupper(unique(c("Drd1", "Ido1", "Ppp1r1b", "Adora2a", "Ido1", "Adora2a", "Gpr83", "Penk", "Drd1", "Dclk3", "Mhrt", "Drd1", "Lrpprc", "Tac1", "Wnt2", "A230065H16Rik", "Zic1", "2810459M11Rik", "Gm14964")))
striatal <- striatal[which(striatal %in% rna_expressed_norm$gene_name)]
cortical <- toupper(unique(c("Myl4", "Cpne4", "Rprm", "Rell1", "Cplx3", "Nxph3", "Hs3st4", "Sulf1", "Prss12", "Igfbp6", "C130074G19Rik", "Nr4a2", "Col24a1", "Oprk1", "Hs3st2", "Vipr1", "Pde1a", "Dkkl1", "Galntl6", "Krt12", "Tcap", "A830009L08Rik", "Gm12371", "Lamp5", "Tshz2", "Ddit4l", "Wfs1", "RP24-134N2.1", "Vwc2l", "Cbln1", "Ntsr1", "Rxfp1", "Fermt1", "RP23-231J2.1", "Dbpht2", "Cdkl4", "Scube1", "Tekt5", "Trim54", "Slc30a3", "Lmo3", "Abi3bp", "4930426D05Rik", "Ndst4", "Klhl14", "Rgs14", "Oxtr", "Bmp3", "Fezf2", "RP24-134N2.1", "Kcng1", "Pthlh", "Cox6a2", "Rbp4", "Cox6a2", "Cort", "Sst", "Ccna1", "Crhbp", "Tacr1", "Chodl", "Lhx6", "Pde11a", "Npy", "Ptchd2", "Cplx3", "Teddm3", "Krt73", "Stk32b", "5330429C05Rik", "Yjefn3", "Hdhd3", "Npas1", "Col19a1", "Slc17a8", "Vip", "Gm17750", "Cxcl14", "Vip", "Tiam1", "Tac2", "Epha7")))
cortical <- cortical[which(cortical %in% rna_expressed_norm$gene_name)]
markers <- data.frame(markers = c(cortical,striatal),
type = c(rep("cortical", length(cortical)), rep("striatal", length(striatal))))
rownames(markers) <- markers$markers
rownames(rna_expressed_norm) <- make.unique(rna_expressed_norm$gene_name)
markers_rna_norm <- rna_expressed_norm[which(rna_expressed_norm$gene_name %in% markers$markers),coldata_in$Column]
markers_rna_norm <- markers_rna_norm[markers$markers,]
pheatmap(log2(markers_rna_norm+0.5),
cluster_rows = F,
cluster_cols = F,
annotation_row = markers[,"type", drop=F],
annotation_col = coldata_in[,"Stage", drop=F],
gaps_col = 7, fontsize = 8, show_colnames = F, main = "Cortical and striatal markers in iNs RNAseq")
rownames(protein) <- protein$gene_unique
markers_protein_norm <- protein[which(protein$gene_id %in% markers$markers),c("gene_id", coldata_in$Sample)]
rownames(markers_protein_norm) <- markers_protein_norm$gene_id
markers_protein_norm <- markers_protein_norm[markers$markers[which(markers$markers %in% markers_protein_norm$gene_id)],]
markers_protein_norm[markers_protein_norm == 0] <- NA
pheatmap(log2(markers_protein_norm[,-1]+0.5),
cluster_rows = F,
cluster_cols = F,
annotation_row = markers[,"type", drop=F],
# annotation_col = coldata_in[,"Stage", drop=F],
gaps_col = 7, fontsize = 8, show_colnames = F, main = "Cortical and striatal markers in iNs Proteomics")